In [2]:
#################################################
# EXPLORATORY DATA ANALYSIS of Air Traffic Data #
#################################################
In [3]:
# (and learning Pandas and Plotly libraries)
In [4]:
#__course__ = 'CS131A:Wostner:Fall2016'
#__author__ = 'mvarga2'
#__date__ = '2016-12-18'
In [5]:
# source file: 704969640_T_ONTIME_2016_10.csv
# data dictionary file: 704969640_T_ONTIME_ReadMe.csv
# period: October 2016
# data provider: Bureau of Transportation Statistics 
In [ ]:
# Table Of Contents:
#
# 1. Getting a sneak peek at the data
# 2. Calculating the measures of center and spread
### A. Analysing SFO outbound flights
### B. Analysing SFO inbound flights
# 3. Graphing the results - offline
# 4. Cleaning up
# 5. Findings and Conclusion
In [6]:
# 1. Getting a sneak peek at the data
In [7]:
# importing pandas library
import pandas as pd
In [8]:
# reading in source data from a csv file into df_airdata Pandas dataframe object
# please, check the location of the source file and specify its path accordingly
df_airdata = pd.read_csv('~/air_traffic_data/704969640_T_ONTIME_2016_10.csv')
#del df_airdata
In [9]:
# reading in a list of airport codes
df_airports=pd.read_csv('~/air_traffic_data/L_AIRPORT_ID.csv')
In [10]:
# gettting to know our data (similar to unix head command)
df_airdata.head()
Out[10]:
YEAR QUARTER MONTH DAY_OF_MONTH DAY_OF_WEEK FL_DATE UNIQUE_CARRIER AIRLINE_ID CARRIER ORIGIN_AIRPORT_ID ... DEP_DELAY DEP_DELAY_NEW ARR_DELAY ARR_DELAY_NEW CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Unnamed: 20
0 2016 4 10 3 1 2016-10-03 AA 19805 AA 10721 ... -4.0 0.0 -6.0 0.0 NaN NaN NaN NaN NaN NaN
1 2016 4 10 4 2 2016-10-04 AA 19805 AA 10721 ... -6.0 0.0 -3.0 0.0 NaN NaN NaN NaN NaN NaN
2 2016 4 10 1 6 2016-10-01 AA 19805 AA 12478 ... -7.0 0.0 -30.0 0.0 NaN NaN NaN NaN NaN NaN
3 2016 4 10 2 7 2016-10-02 AA 19805 AA 12478 ... 17.0 17.0 18.0 18.0 17.0 0.0 1.0 0.0 0.0 NaN
4 2016 4 10 3 1 2016-10-03 AA 19805 AA 12478 ... -5.0 0.0 -10.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 21 columns

In [11]:
# getting to know our data (similar to unix tail command)
df_airdata.tail()
Out[11]:
YEAR QUARTER MONTH DAY_OF_MONTH DAY_OF_WEEK FL_DATE UNIQUE_CARRIER AIRLINE_ID CARRIER ORIGIN_AIRPORT_ID ... DEP_DELAY DEP_DELAY_NEW ARR_DELAY ARR_DELAY_NEW CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Unnamed: 20
472621 2016 4 10 31 1 2016-10-31 WN 19393 WN 15376 ... 7.0 7.0 4.0 4.0 NaN NaN NaN NaN NaN NaN
472622 2016 4 10 31 1 2016-10-31 WN 19393 WN 15376 ... -1.0 0.0 -5.0 0.0 NaN NaN NaN NaN NaN NaN
472623 2016 4 10 31 1 2016-10-31 WN 19393 WN 15376 ... 1.0 1.0 -20.0 0.0 NaN NaN NaN NaN NaN NaN
472624 2016 4 10 31 1 2016-10-31 WN 19393 WN 15376 ... -4.0 0.0 2.0 2.0 NaN NaN NaN NaN NaN NaN
472625 2016 4 10 31 1 2016-10-31 WN 19393 WN 15376 ... 11.0 11.0 12.0 12.0 NaN NaN NaN NaN NaN NaN

5 rows × 21 columns

In [12]:
# getting the number of columns in our dataframe
len(df_airdata.columns)
Out[12]:
21
In [13]:
# listing the names of all available columns
# df_airdata.columns
df_airdata.columns.values.tolist()
Out[13]:
['YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_DATE',
 'UNIQUE_CARRIER',
 'AIRLINE_ID',
 'CARRIER',
 'ORIGIN_AIRPORT_ID',
 'DEST_AIRPORT_ID',
 'DEP_DELAY',
 'DEP_DELAY_NEW',
 'ARR_DELAY',
 'ARR_DELAY_NEW',
 'CARRIER_DELAY',
 'WEATHER_DELAY',
 'NAS_DELAY',
 'SECURITY_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'Unnamed: 20']
In [14]:
# getting the columns' datatypes
df_airdata.dtypes
Out[14]:
YEAR                     int64
QUARTER                  int64
MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
FL_DATE                 object
UNIQUE_CARRIER          object
AIRLINE_ID               int64
CARRIER                 object
ORIGIN_AIRPORT_ID        int64
DEST_AIRPORT_ID          int64
DEP_DELAY              float64
DEP_DELAY_NEW          float64
ARR_DELAY              float64
ARR_DELAY_NEW          float64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
Unnamed: 20            float64
dtype: object
In [15]:
# generating summary statistics for each column of the whole dataframe 
df_airdata.describe()
Out[15]:
YEAR QUARTER MONTH DAY_OF_MONTH DAY_OF_WEEK AIRLINE_ID ORIGIN_AIRPORT_ID DEST_AIRPORT_ID DEP_DELAY DEP_DELAY_NEW ARR_DELAY ARR_DELAY_NEW CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Unnamed: 20
count 472626.0 472626.0 472626.0 472626.000000 472626.000000 472626.000000 472626.000000 472626.000000 468150.000000 468150.000000 467486.000000 467486.000000 63470.000000 63470.000000 63470.000000 63470.000000 63470.000000 0.0
mean 2016.0 4.0 10.0 15.970203 3.983799 19902.495011 12669.914761 12669.457948 5.493946 8.647735 -0.223046 8.518910 18.915913 1.025177 14.930046 0.054041 20.431968 NaN
std 0.0 0.0 0.0 8.855973 2.069908 379.362355 1534.607695 1534.523927 33.110385 32.094842 34.918637 31.496976 55.845992 13.724331 31.688691 1.629245 41.506585 NaN
min 2016.0 4.0 10.0 1.000000 1.000000 19393.000000 10135.000000 10135.000000 -54.000000 0.000000 -76.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN
25% 2016.0 4.0 10.0 8.000000 2.000000 19790.000000 11292.000000 11292.000000 -5.000000 0.000000 -15.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN
50% 2016.0 4.0 10.0 16.000000 4.000000 19805.000000 12889.000000 12889.000000 -2.000000 0.000000 -7.000000 0.000000 1.000000 0.000000 3.000000 0.000000 0.000000 NaN
75% 2016.0 4.0 10.0 24.000000 6.000000 20304.000000 14027.000000 14027.000000 3.000000 3.000000 3.000000 3.000000 17.000000 0.000000 19.000000 0.000000 25.000000 NaN
max 2016.0 4.0 10.0 31.000000 7.000000 21171.000000 16218.000000 16218.000000 2149.000000 2149.000000 2142.000000 2142.000000 2142.000000 936.000000 1446.000000 155.000000 1432.000000 NaN
In [16]:
# getting total number of rows in our dataframe
len(df_airdata.index)
Out[16]:
472626
In [17]:
# printing first 5 rows of the airport codes dataframe - 5 is the default for both head and tail commands
df_airports.head()
Out[17]:
Code Description
0 10001 Afognak Lake, AK: Afognak Lake Airport
1 10003 Granite Mountain, AK: Bear Creek Mining Strip
2 10004 Lik, AK: Lik Mining Camp
3 10005 Little Squaw, AK: Little Squaw Airport
4 10006 Kizhuyak, AK: Kizhuyak Bay
In [18]:
# looking up a unique code representing San Francisco International Airport (SFO)
airport_code =  (df_airports[df_airports['Description']
            .str.contains("Francisco International")]
            .iloc[0]['Code']
        )
print(airport_code)
14771
In [19]:
# getting the number of records(rows) with only flights terminating at SFO 
len(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code].index)
#df_airports[df_airports['Description'].str.contains("Francisco, CA")].iloc[:,[0,1]]
Out[19]:
14818
In [20]:
# getting the number of records with only flights originating at SFO
len(df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code].index)
Out[20]:
14801
In [21]:
# printing only columns with specified indexes using iloc method 
df_airdata.iloc[:,[0,3,5]].head()
Out[21]:
YEAR DAY_OF_MONTH FL_DATE
0 2016 3 2016-10-03
1 2016 4 2016-10-04
2 2016 1 2016-10-01
3 2016 2 2016-10-02
4 2016 3 2016-10-03
In [22]:
# counting number of values in each column. note, that some columns have missing values for some of the rows, 
# hence the counts are different for the columns
df_airdata.count()
Out[22]:
YEAR                   472626
QUARTER                472626
MONTH                  472626
DAY_OF_MONTH           472626
DAY_OF_WEEK            472626
FL_DATE                472626
UNIQUE_CARRIER         472626
AIRLINE_ID             472626
CARRIER                472626
ORIGIN_AIRPORT_ID      472626
DEST_AIRPORT_ID        472626
DEP_DELAY              468150
DEP_DELAY_NEW          468150
ARR_DELAY              467486
ARR_DELAY_NEW          467486
CARRIER_DELAY           63470
WEATHER_DELAY           63470
NAS_DELAY               63470
SECURITY_DELAY          63470
LATE_AIRCRAFT_DELAY     63470
Unnamed: 20                 0
dtype: int64
In [23]:
# reporting the minimum flight date in our dataset (we are working with October data only)
df_airdata['FL_DATE'].min()
Out[23]:
'2016-10-01'
In [24]:
# reporting the maximum flight date in our dataset 
df_airdata['FL_DATE'].max()
Out[24]:
'2016-10-31'
In [25]:
# getting a quick glance at the data related to SFO as a destination airport using "isin" method and a list of values
airport_ids = [airport_code,'']
df_airdata.loc[df_airdata['DEST_AIRPORT_ID'].isin(airport_ids)].head()
Out[25]:
YEAR QUARTER MONTH DAY_OF_MONTH DAY_OF_WEEK FL_DATE UNIQUE_CARRIER AIRLINE_ID CARRIER ORIGIN_AIRPORT_ID ... DEP_DELAY DEP_DELAY_NEW ARR_DELAY ARR_DELAY_NEW CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Unnamed: 20
275 2016 4 10 1 6 2016-10-01 AA 19805 AA 12478 ... -6.0 0.0 14.0 14.0 NaN NaN NaN NaN NaN NaN
276 2016 4 10 2 7 2016-10-02 AA 19805 AA 12478 ... -2.0 0.0 -6.0 0.0 NaN NaN NaN NaN NaN NaN
277 2016 4 10 3 1 2016-10-03 AA 19805 AA 12478 ... -6.0 0.0 3.0 3.0 NaN NaN NaN NaN NaN NaN
278 2016 4 10 4 2 2016-10-04 AA 19805 AA 12478 ... -6.0 0.0 -17.0 0.0 NaN NaN NaN NaN NaN NaN
279 2016 4 10 5 3 2016-10-05 AA 19805 AA 12478 ... -2.0 0.0 28.0 28.0 0.0 0.0 28.0 0.0 0.0 NaN

5 rows × 21 columns

In [25]:
 
In [26]:
# 2. Calculating the measures of center and spread
In [26]:
 
In [27]:
# A. Analysing SFO outbound flights 
# (i.e. flighs originating at SFO)
In [27]:
 
In [28]:
# finding the mean (average) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .mean()
    )
Out[28]:
15.433562071116656
In [29]:
# finding the standard deviation value of departure delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .std()
    )
Out[29]:
45.60195922283215
In [30]:
# finding the median (middle) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .median()
    )
Out[30]:
-1.0
In [31]:
# finding the mode (most frequent) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .mode()
    )
Out[31]:
0   -4.0
dtype: float64
In [32]:
# finding the mean (average) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .mean()
    )
Out[32]:
7.4388619014573214
In [33]:
# finding the standard deviation value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .std()
    )
Out[33]:
47.382536555618536
In [34]:
# finding the median (middle) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .median()
    )
Out[34]:
-6.0
In [35]:
# finding the mode (most frequent) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .mode()
    )
Out[35]:
0   -8.0
dtype: float64
In [36]:
# finding the outliers - maximum departure/arrival delay for SFO outbound flights (in minutes) 
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
    .max()
    )
Out[36]:
DEP_DELAY    2149.0
ARR_DELAY    2142.0
dtype: float64
In [37]:
# finding the outliers - minimum departure/arrival delay for SFO outbound flights (in minutes) 
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
    .min()
    )
Out[37]:
DEP_DELAY   -21.0
ARR_DELAY   -62.0
dtype: float64
In [38]:
# viewing first 10 rows sorted by arrival delay descending for SFO outbound flights (in minutes) 
(df_airdata
    .loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
    .sort_values(by=['ARR_DELAY'], ascending=[0])
    .head(10)
    )
Out[38]:
CARRIER DEP_DELAY ARR_DELAY
27920 AA 2149.0 2142.0
348809 UA 628.0 624.0
236600 HA 507.0 501.0
3808 AA 435.0 435.0
333452 UA 394.0 406.0
308788 UA 369.0 403.0
314966 UA 420.0 396.0
307933 UA 422.0 383.0
331681 UA 372.0 372.0
228488 F9 389.0 371.0
In [39]:
# viewing last 10 rows sorted by arrival delay for SFO outbound flights (in minutes) 
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
    .sort_values(by=['ARR_DELAY'], ascending=[1])
    .head(10)
    )
Out[39]:
CARRIER DEP_DELAY ARR_DELAY
150878 DL -8.0 -62.0
453 AA -5.0 -60.0
95448 B6 -5.0 -60.0
97445 B6 -13.0 -60.0
77499 AA -11.0 -58.0
5565 AA -4.0 -57.0
157031 DL -6.0 -57.0
3994 AA -8.0 -56.0
95289 B6 -1.0 -56.0
418 AA -6.0 -55.0
In [40]:
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending  
df_delays_out_desc  = (df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code ][['CARRIER','DEP_DELAY','ARR_DELAY']]
        #.groupby(['CARRIER'], as_index = False)
        .groupby(['CARRIER'])
        .mean()
        .reset_index()
        .sort_values(by=['ARR_DELAY'], ascending=[0])
        .head(5) # 5 is default
        )
df_delays_out_desc
Out[40]:
CARRIER DEP_DELAY ARR_DELAY
4 F9 25.719008 18.541322
9 WN 21.740364 17.738719
1 AS 12.529540 13.490153
8 VX 14.431235 9.968495
5 HA 13.693548 8.983871
In [41]:
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - ascending
# displaying first five airlines with shortest average delays
# if negative, the flights departed/arrived on average earlier than scheduled
df_delays_out_asc  = (df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
        .groupby(['CARRIER'], as_index = False)
        .mean()
        .round(2)
        .sort_values(by=['ARR_DELAY'], ascending=[1])
        .head())

df_delays_out_asc
Out[41]:
CARRIER DEP_DELAY ARR_DELAY
3 DL 9.35 -3.92
0 AA 10.69 -1.02
2 B6 16.08 5.26
6 OO 12.74 7.33
7 UA 18.04 7.74
In [41]:
 
In [42]:
# B. Analysing SFO inbound flights
# (i.e. flights terminating at SFO)
In [42]:
 
In [43]:
# finding the mean (average) value of arrival delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .mean()
    )
Out[43]:
19.410119378123266
In [44]:
# finding the standard deviation value of arrival delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .std()
    )
Out[44]:
55.126037055541417
In [45]:
# finding the median value of arrival delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .median()
    )
Out[45]:
1.0
In [46]:
# finding the mode value of departure delay for SFO outbound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .mode()
    )
Out[46]:
0   -4.0
dtype: float64
In [47]:
# finding the mean value of departure delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .mean()
    )
Out[47]:
19.450096899224807
In [48]:
# finding the standard deviation value of departure delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .std()
    )
Out[48]:
51.764093969737466
In [49]:
# finding the median value of departure delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .median()
    )
Out[49]:
-1.0
In [50]:
# finding the mode value of departure delay for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
    .mode()
    )
Out[50]:
0   -5.0
dtype: float64
In [51]:
# summing up the average delay of flights headed to SFO (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
    .mean()
    .sum()
    )
# this number does not make sense, please ignore
Out[51]:
38.860216277348073
In [52]:
# finding the outliers - maximum arrival delay of SFO bound flights (in minutes) 
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
    .max()
    )
Out[52]:
1120.0
In [53]:
# finding the outliers - minimum arrival delay of SFO bound flights (in minutes)
(df_airdata
     .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
     .min()
     )
Out[53]:
-57.0
In [54]:
# printing maximum values of both departure and arrival delays for SFO bound flights (in minutes)
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
    .max()
    )
Out[54]:
DEP_DELAY    1143.0
ARR_DELAY    1120.0
dtype: float64
In [55]:
# printing mininum values of both departure and arrival delays for SFO bound flights (in minutes)
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
    .min()
    )
Out[55]:
DEP_DELAY   -31.0
ARR_DELAY   -57.0
dtype: float64
In [56]:
# viewing first 10 rows sorted by arrival delay descending
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
    .sort_values(by=['ARR_DELAY'], ascending=[0])
    .head(10)
    )
Out[56]:
CARRIER DEP_DELAY ARR_DELAY
255232 OO 1143.0 1120.0
333533 UA 1038.0 1052.0
264324 OO 1006.0 996.0
365335 UA 682.0 667.0
65497 AA 605.0 619.0
276286 OO 622.0 615.0
78080 AS 539.0 548.0
312897 UA 500.0 502.0
332033 UA 507.0 490.0
294135 OO 479.0 483.0
In [57]:
# viewing last 10 rows sorted by arrival delay  
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
    .sort_values(by=['ARR_DELAY'], ascending=[1])
    .head(10)
    )
Out[57]:
CARRIER DEP_DELAY ARR_DELAY
360165 VX -7.0 -57.0
355058 VX -7.0 -50.0
360364 VX -9.0 -49.0
110887 DL -2.0 -48.0
357197 VX -4.0 -47.0
358645 VX -7.0 -47.0
346260 UA -6.0 -46.0
356903 VX -1.0 -46.0
351682 VX -13.0 -44.0
112064 DL -4.0 -43.0
In [58]:
# computing departure and arrival delay averages by airline 
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
    .groupby(['CARRIER'])
    .mean()
    #.reset_index()
    )
Out[58]:
DEP_DELAY ARR_DELAY
CARRIER
AA 15.051245 19.977273
AS 15.794055 19.529412
B6 15.852391 16.617464
DL 16.019005 14.465158
F9 22.913580 29.702479
HA 12.209677 1.262295
OO 21.102718 20.908513
UA 18.820989 16.955486
VX 22.913703 21.796362
WN 23.259608 25.247820
In [59]:
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending  
df_delays_in_desc  = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
        #.groupby(['CARRIER'], as_index = False)
        .groupby(['CARRIER'])
        .mean()
        .reset_index()
        .sort_values(by=['ARR_DELAY'], ascending=[0])
        .head(5) # 5 is default
        )
df_delays_in_desc
Out[59]:
CARRIER DEP_DELAY ARR_DELAY
4 F9 22.913580 29.702479
9 WN 23.259608 25.247820
8 VX 22.913703 21.796362
6 OO 21.102718 20.908513
0 AA 15.051245 19.977273
In [60]:
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending 
# displaying first five airlines with longest average delays 
# second option is to use as_index = False clause instead of resetting index
df_delays_in_desc  = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
        .groupby(['CARRIER'], as_index = False)
        .mean()
        .sort_values(by=['ARR_DELAY'], ascending=[0])
        .head()
        )

df_delays_in_desc
Out[60]:
CARRIER DEP_DELAY ARR_DELAY
4 F9 22.913580 29.702479
9 WN 23.259608 25.247820
8 VX 22.913703 21.796362
6 OO 21.102718 20.908513
0 AA 15.051245 19.977273
In [61]:
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - ascending
# displaying first five airlines with shortest average delays 
# if negative, the flights departed/arrived on average earlier than scheduled
df_delays_in_asc  = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
        .groupby(['CARRIER'], as_index = False)
        .mean()
        .round(2)
        .sort_values(by=['ARR_DELAY'], ascending=[1])
        .head())
#testdf['CARRIER','DEP_DELAY']
df_delays_in_asc
Out[61]:
CARRIER DEP_DELAY ARR_DELAY
5 HA 12.21 1.26
3 DL 16.02 14.47
2 B6 15.85 16.62
7 UA 18.82 16.96
1 AS 15.79 19.53
In [61]:
 
In [62]:
# finding duration of arrival delay causes in minutes
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
    .sum()
    #.reset_index()
    )
-c:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Out[62]:
ARR_DELAY              340335.0
CARRIER_DELAY           45490.0
WEATHER_DELAY            7991.0
NAS_DELAY              212395.0
SECURITY_DELAY             71.0
LATE_AIRCRAFT_DELAY     74388.0
dtype: float64
In [63]:
# assigning sums of arrival delay causes to variables 
arr_del,car_del,wea_del,nas_del,sec_del,late_del = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
    .sum()
    )
In [64]:
# alternatively we can create a list with columns values as its elements using .tolist() method on our df_airdata dataframe
arr_del_sums = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][['ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
    .sum()
    .tolist()
    )
arr_del_sums
Out[64]:
[340335.0, 45490.0, 7991.0, 212395.0, 71.0, 74388.0]
In [65]:
# confirming the sum of all sums of specific delays matches that of overall arrival delay
(df_airdata
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
    .sum()
    .sum()
    )
-c:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Out[65]:
340335.0
In [66]:
# 3. Graphing the results - offline
In [66]:
 
In [67]:
import plotly
from plotly.graph_objs import Scatter, Layout, Bar

plotly.offline.init_notebook_mode()

layout = Layout(
    title='Average SFO Arrival Delay by Airline',
    xaxis=dict(
        title='Airlines',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Arrival delay (in minutes)',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    )
)

data = [Bar(x=df_delays_in_desc['CARRIER'], y=df_delays_in_desc['ARR_DELAY'])]

plotly.offline.iplot({ "data": data, "layout":layout})
#py.iplot(data, filename='basic-bar')
In [68]:
import plotly
from plotly.graph_objs import Scatter, Layout, Bar

plotly.offline.init_notebook_mode()

layout = Layout(
    title='Average SFO Departure Delay by Airline',
    xaxis=dict(
        title='Airlines',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Departure delay (in minutes)',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    )
)

data = [Bar(x=df_delays_out_desc['CARRIER'], y=df_delays_out_desc['DEP_DELAY'])]

plotly.offline.iplot({ "data": data, "layout":layout})
#py.iplot(data, filename='basic-bar')
In [69]:
import plotly.plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode()

fig = {
    'data': [{'labels': ['Carrier Delay', 'Weather Delay', 'NAS Delay', 'Security Delay', 'Late Aircraft Delay'],
              'values': [car_del/arr_del, wea_del/arr_del, nas_del/arr_del, sec_del/arr_del, late_del/arr_del],
              'type': 'pie'}],
    'layout': {'title': 'Distribution of SFO Arrival Delay Causes - October 2016'}
     }

#plotly.offline.iplot({ "data": data, "layout":layout})
plotly.offline.iplot(fig)
In [70]:
# alternatively plotting the same graph using our list of arrival delay sums "arr_del_sums" instead 
plotly.offline.init_notebook_mode()

mylist=list()

for index,element in enumerate(arr_del_sums):
    if not index == 0:
        #print(element/arr_del_sums[0])
        mylist.append(element/arr_del_sums[0])

fig = {
    'data': [{'labels': ['Carrier Delay', 'Weather Delay', 'NAS Delay', 'Security Delay', 'Late Aircraft Delay'],
#              'values': [arr_del_sums[1]/arr_del_sums[0], arr_del_sums[2]/arr_del_sums[0], arr_del_sums[3]/arr_del_sums[0], arr_del_sums[4]/arr_del_sums[0], arr_del_sums[5]/arr_del_sums[0]],
              'values': mylist,  
              'type': 'pie'}],
    'layout': {'title': 'Distribution of SFO Arrival Delay Causes - October 2016'}
     }

#plotly.offline.iplot({ "data": data, "layout":layout})
plotly.offline.iplot(fig)
In [71]:
# creating a subset of data depicting airlines terminating at SFO ordered by average arrival delay descending
# alongside with number of inbound flights
df_airdata_sub  = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','ARR_DELAY']]
        .groupby(['CARRIER'], as_index = True)
        .agg(['mean','count'])
        .sort_values(by=[('ARR_DELAY','mean')], ascending=[0])
        )
df_airdata_sub.columns = df_airdata_sub.columns.droplevel(0)
df_airdata_sub = df_airdata_sub.reset_index()
df_airdata_sub
Out[71]:
CARRIER mean count
0 F9 29.702479 242
1 WN 25.247820 1376
2 VX 21.796362 1704
3 OO 20.908513 2831
4 AA 19.977273 1364
5 AS 19.529412 459
6 UA 16.955486 4785
7 B6 16.617464 481
8 DL 14.465158 1105
9 HA 1.262295 61
In [72]:
# creating a subset of 10 flights with longest delays alonside with carrier name and day of month of each flight 
df_top_delay = (df_airdata
    #.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY','DAY_OF_MONTH']]
    .loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','ARR_DELAY','DAY_OF_MONTH']] 
    .sort_values(by=['ARR_DELAY'], ascending=[0])
    .head(10)
    )
df_top_delay
Out[72]:
CARRIER ARR_DELAY DAY_OF_MONTH
255232 OO 1120.0 2
333533 UA 1052.0 13
264324 OO 996.0 9
365335 UA 667.0 2
65497 AA 619.0 24
276286 OO 615.0 16
78080 AS 548.0 16
312897 UA 502.0 27
332033 UA 490.0 13
294135 OO 483.0 25
In [73]:
# merging these two subsets together by CARRIER name.
df_airdata_sub_out = df_airdata_sub.merge(df_top_delay, on='CARRIER').sort_values(by=['ARR_DELAY'], ascending=[0])
df_airdata_sub_out
Out[73]:
CARRIER mean count ARR_DELAY DAY_OF_MONTH
0 OO 20.908513 2831 1120.0 2
6 UA 16.955486 4785 1052.0 13
1 OO 20.908513 2831 996.0 9
7 UA 16.955486 4785 667.0 2
4 AA 19.977273 1364 619.0 24
2 OO 20.908513 2831 615.0 16
5 AS 19.529412 459 548.0 16
8 UA 16.955486 4785 502.0 27
9 UA 16.955486 4785 490.0 13
3 OO 20.908513 2831 483.0 25
In [74]:
# printing a bubble chart
import plotly 
from plotly.graph_objs import Scatter, Layout

plotly.offline.init_notebook_mode()

data = [
    {
        'x': df_airdata_sub_out['DAY_OF_MONTH'],
        'y': df_airdata_sub_out['ARR_DELAY'],
        'text': df_airdata_sub_out['CARRIER'],
        'mode': 'markers',
        'marker': {
            'color': [120, 125, 130, 135, 140, 145, 150, 155, 160, 165 ],
            #'size': [15, 30, 55, 70, 90, 110,5, 30, 55, 70, 90],
            'size': df_airdata_sub_out['mean'] ,
            'sizeref':0.25 #,
           # 'showscale': True
        }
    }
]

layout = Layout(
    title=' Ten Longest Arrival Delays (in minutes)',
    xaxis=dict(
        title='Day Of Month',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Arrival delay (in minutes)',
        titlefont=dict(
            family='Courier New, monospace',
            size=14,
            color='#7f7f7f'
        )
    )
)
plotly.offline.iplot({ "data": data, "layout":layout})
In [74]:
 
In [75]:
# 4. Cleaning up
In [75]:
 
In [76]:
del df_airdata
del df_delays_in_desc
del df_delays_in_asc
del df_delays_out_desc
del df_delays_out_asc
del df_airdata_sub
del df_top_delay
df_airdata_sub_out
Out[76]:
CARRIER mean count ARR_DELAY DAY_OF_MONTH
0 OO 20.908513 2831 1120.0 2
6 UA 16.955486 4785 1052.0 13
1 OO 20.908513 2831 996.0 9
7 UA 16.955486 4785 667.0 2
4 AA 19.977273 1364 619.0 24
2 OO 20.908513 2831 615.0 16
5 AS 19.529412 459 548.0 16
8 UA 16.955486 4785 502.0 27
9 UA 16.955486 4785 490.0 13
3 OO 20.908513 2831 483.0 25
In [76]:
 
In [77]:
# 5. Findings and Conclusion
In [77]:
 
In [78]:
# We have analyzed air traffic dataset for October 2016 provided by Bureau of Transportation Statistics
# available @http://www.transtats.bts.gov. We have only dealt with a subset of the data related to San Francisco 
# International Airport(SFO).
#
# Here are some of the findigs:
#
# 1. On average a flight originating from SFO in October 2016 had a delay of 15 minutes but 
# on average it arrived with only a 7.5 minute delay to its destination.
# 
# 2. The airline departing from SFO with the highest average arrival delay of 18 minutes is Frontier Airlines 
# closely followed by Southwest Airlines. On the other hand Delta Airlines has the lowest arrival delay average
# in  October 2016. In fact, on average it arrives ahead of time. American Airlines finished 2nd in terms of flight 
# accuracy.
#
# 3. Flights terminating at SFO have a delay of 19 minutes on average.
# 
# 4. The top average arrival delay of 29 minutes has been recorded again for Frontier Airlines.
# 
# 5. The lowest average arrival delay of 1.26 minutes for flights terminating at SFO was accomplished by Hawaiian Airlines, 
# Delta placed 2nd.  
#
# 6. The NAS(National Airspace System) can be blamed for the majority of SFO arrival delays, on average 62 percent of delays
# can be attributed to NAS
#
# Please, bear in mind that these findings may not be significant over longer period of time